CT27 ST3D PLAC1 KD Bulk RNAseq analyses
Environment Setup
salloc -N 1 --exclusive -p amd -t 8:00:00
conda activate star
# working dir
mkdir -p /work/LAS/geetu-lab/arnstrm/hTS_CT27_ST3D_PLAC1_KD
cd /work/LAS/geetu-lab/arnstrm/hTS_CT27_ST3D_PLAC1_KD
mkdir -p 1_data
mkdir -p 2_fastqc
mkdir -p 3_STAR-mapping
mkdir -p 4_featureCounts
mkdir -p 5_multiqc
# file structure
tree -L 1
.
├── 1_data
├── 2_fastqc
├── 3_STAR-mapping
├── 4_featureCounts
└── 5_multiqcRaw data
Raw data was downloaded from the LSS using rsync
command.
cd 1_data
rsync -avP /lss/folder/path/hTS_CT27_ST3D_PLAC1_KD ./1_data
# GEO link will be included laterGenome/annotation
Additional files required for the analyses were downloaded from GenCode. The downloaded files are as follows:
cd 3_STAR-mapping
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/GRCh38.p13.genome.fa.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.primary_assembly.annotation.gff3.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.primary_assembly.annotation.gtf.gz
gunzip GRCh38.p13.genome.fa.gz
gunzip gencode.v42.primary_assembly.annotation.gtf.gz
gunzip gencode.v42.primary_assembly.annotation.gff3.gz
# ids for prot coding genes
awk '$3=="gene" && $9 ~/gene_type=protein_coding;/ {
split($9,a,";"); print gensub(/gene_id=/, "", 1, a[2])
}' gencode.v42.primary_assembly.annotation.gff3 > GRCh38.protein_coding
# you should have 20,054 ids (protein-coding genes)FastQC
Quality inspection of the reads. The multiqc report,
collating all samples together are provided as html file.
cd 2_fastqc
for fq in ../1_data/*.fq.gz; do
fastqc --threads $SLURM_JOB_CPUS_PER_NODE $fq;
doneMapping
To index the genome, following command was run (in an interactive session).
fastaGenome="GRCh38.p13.genome.fa"
gtf="gencode.v42.primary_assembly.annotation.gtf"
STAR --runThreadN $SLURM_JOB_CPUS_PER_NODE \
--runMode genomeGenerate \
--genomeDir $(pwd) \
--genomeFastaFiles $fastaGenome \
--sjdbGTFfile $gtf \
--sjdbOverhang 1Each fastq file was mapped to the indexed genome as
using runSTAR_map.sh script shown below:
#!/bin/bash
conda activate star
index=/work/LAS/geetu-lab/arnstrm/GRCh38_index
read1=$1
read2=$(echo ${read1} | sed 's/_R1_/_R2_/g')
cpus=${SLURM_JOB_CPUS_PER_NODE}
out=$(basename ${read1} | cut -f 1-2 -d "_")
STAR \
--runThreadN ${cpus} \
--genomeDir ${index} \
--outSAMtype BAM SortedByCoordinate \
--quantMode GeneCounts \
--outFilterScoreMinOverLread 0.3 \
--outFilterMatchNminOverLread 0.3 \
--outFileNamePrefix ${out}_ \
--readFilesCommand zcat \
--outWigType bedGraph \
--outWigStrand Unstranded \
--outWigNorm RPM \
--readFilesIn ${read1} ${read2}Mapping was run with a simple loop:
for fq in *_R1_*fastq.gz; do
runSTAR_map.sh $fq;
doneCounts
For generating counts from the mapped reads, we used
subread package program featureCounts. All bam
files were supplied together to generate a single count file for
individual samples.
cd 3_STAR-mapping
realpath *.bam > ../4_featureCounts/bam.fofn
cd ../4_featureCounts
gtf="gencode.v42.primary_assembly.annotation.gtf"
while read line; do
ln -s $line;
done
featureCounts \
-T ${SLURM_CPUS_ON_NODE} \
-a ${gtf} \
-t exon \
-g gene_id \
-p \
-B \
--countReadPairs \
-o merged_counts.txt \
--tmpDir ./tmp *.bamThe generated counts file was processed to use it direclty with
DESeq2
grep -v "^#" merged_counts.txt |\
cut -f 1,7- |\
sed 's/_Aligned.sortedByCoord.out.bam//g' > merged_clean-counts.txt
head -n 1 merged_clean-counts.txt > header
grep -Fw -f GRCh38.protein_coding merged_clean-counts.txt > body
cat header body > counts_genes.tsv
rm body headCreate a info file:
head -n 1 counts_genes.tsv |\
tr "\t" "\n" |\
grep -v "^Geneid" |\
awk '{print $1"\t"gensub(/..$/, "", 1,$1)}' > info.tsvDifferential expression analysis
Differential expression (DE) analyses using DESeq2 was
performed as shown below.
Prerequisites
R packages required for this section are loaded
# set path
setwd("/work/LAS/geetu-lab/arnstrm/hTS_CT27_ST3D_PLAC1_KD")
# load the modules
library(tidyverse)
library(DESeq2)
library(vsn)
library(pheatmap)
library(ggrepel)
library(RColorBrewer)
library(reshape2)
require(biomaRt)
library(TissueEnrich)
library(plotly)
library(cowplot)
library(biomaRt)
library(scales)
library(kableExtra)
library(htmlwidgets)
library(DT)
library(enrichR)Import datasets
The counts data and its associated metadata
(coldata) are imported for analyses.
countsFile = 'assets/counts_genes.tsv'
groupFile = 'assets/info.tsv'
coldata <-
read.csv(
groupFile,
row.names = 1,
sep = "\t",
header = FALSE,
stringsAsFactors = TRUE
)
colnames(coldata) <- "condition"
cts <- as.matrix(read.delim(countsFile, row.names = 1, header = TRUE))Reorder columns of cts according to coldata
rows. Check if samples in both files match.
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
cts <- cts[, rownames(coldata)]DESeq2
The batch corrected read counts are then used for running DESeq2 analyses
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ condition)
vsd <- vst(dds, blind = FALSE)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]
dds <- DESeq(dds)PCA plot for QC
PCA plot for the dataset that includes all libraries.
rv <- rowVars(assay(vsd))
select <-
order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
pca <- prcomp(t(assay(vsd)[select, ]))
percentVar <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
intgroup = "condition"
intgroup.df <- as.data.frame(colData(vsd)[, intgroup, drop = FALSE])
group <- if (length(intgroup) == 1) {
factor(apply(intgroup.df, 1, paste, collapse = " : "))
}
d <- data.frame(
PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
intgroup.df,
name = colnames(vsd)
)
ggplot(d, aes(PC1, PC2, color = condition)) +
scale_shape_manual(values = 1:12) +
scale_color_manual(values = c('PLAC1.4' = '#00b462',
'SCR' = '#980c80')) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
geom_text_repel(aes(label = name)) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))Figure 4: PCA plot for the first 2 principal components
Sample distance for QC
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- colnames(vsd)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)Figure 5: Euclidean distance between samples
Set contrasts and find DE genes
res.PLCvsSCR <-
results(dds,
contrast = c(
"condition",
"PLAC1.4",
"SCR"))Functions
Import functions for processing and plotting
source("/work/LAS/geetu-lab/arnstrm/hTS_CT27_ST3D_PLAC1_KD/assets/de_functions.R")Gene information
ensembl = useMart("ENSEMBL_MART_ENSEMBL")
listDatasets(ensembl) %>%
filter(str_detect(description, "Human"))
ensembl = useDataset("hsapiens_gene_ensembl", mart = ensembl)
listFilters(ensembl) %>%
filter(str_detect(name, "ensembl"))
filterType <- "ensembl_gene_id_version"
head(rownames(counts))
counts <- read.delim("assets/counts_genes.tsv", row.names = 1, header = TRUE)
head(rownames(counts))
filterValues <- rownames(counts)
listAttributes(ensembl) %>%
head(20)
attributeNames <- c('ensembl_gene_id_version',
'ensembl_gene_id',
'external_gene_name')
annot <- getBM(
attributes = attributeNames,
filters = filterType,
values = filterValues,
mart = ensembl
)
attributeNames <- c('ensembl_gene_id_version',
'gene_biotype',
'external_gene_name',
'description')
mart <- getBM(
attributes = attributeNames,
filters = filterType,
values = filterValues,
mart = ensembl
)
write_delim(
annot,
file = "assets/annot.tsv",
delim = "\t"
)
write_delim(
mart,
file = "assets/mart.tsv",
delim = "\t"
) Files were saved, so we don’t query BioMart everytime we run the markdown. The files will be loaded, instead
mart <-
read.csv(
"assets/mart.tsv",
sep = "\t",
header = TRUE,
)
annot <-
read.csv(
"assets/annot.tsv",
sep = "\t",
header = TRUE,
)Results
Write files
processDE(res.PLCvsSCR, "PLAC1vsSCR")Volcano plots
PLAC1 vs. SCR
g <- volcanoPlots(
res.PLCvsSCR,
"PLAC1vsSCR",
"PLAC1",
"SCR",
"#00b462",
"#4d4d4d",
"#980c80",
ChartTitle = "PLAC1 vs. SCR"
)
g
#> Warning: Removed 1992 rows containing missing values (`geom_point()`).
#> Warning: Removed 712 rows containing missing values (`geom_text_repel()`).
#> Warning: ggrepel: 628 unlabeled data points (too many overlaps). Consider
#> increasing max.overlapsFig X: PLAC1 vs SCR
PLAC1 vs. SCR (interactive)
ggplotly(g)
#> Warning in geom2trace.default(dots[[1L]][[3L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[3L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[3L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issuesFig X: PLAC1 vs. SCR
enrichR
setEnrichrSite("Enrichr")
websiteLive <- TRUE
myDBs <-
c(
"DisGeNET",
"GO_Biological_Process_2021",
"HDSigDB_Human_2021",
"KEGG_2021_Human",
"GTEx_Tissues_V8_2023",
"MGI_Mammalian_Phenotype_Level_3",
"MGI_Mammalian_Phenotype_Level_4_2021",
"WikiPathways_2019_Human",
"Panther_2015"
)
if (websiteLive) {
PLAC1vsSCR.up.enriched <- enrichr(PLAC1vsSCR.up.pce2, myDBs)
PLAC1vsSCR.dw.enriched <- enrichr(PLAC1vsSCR.dw.pce2, myDBs)
}
#> Uploading data to Enrichr... Done.
#> Querying DisGeNET... Done.
#> Querying GO_Biological_Process_2021... Done.
#> Querying HDSigDB_Human_2021... Done.
#> Querying KEGG_2021_Human... Done.
#> Querying GTEx_Tissues_V8_2023... Done.
#> Querying MGI_Mammalian_Phenotype_Level_3... Done.
#> Querying MGI_Mammalian_Phenotype_Level_4_2021... Done.
#> Querying WikiPathways_2019_Human... Done.
#> Querying Panther_2015... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#> Querying DisGeNET... Done.
#> Querying GO_Biological_Process_2021... Done.
#> Querying HDSigDB_Human_2021... Done.
#> Querying KEGG_2021_Human... Done.
#> Querying GTEx_Tissues_V8_2023... Done.
#> Querying MGI_Mammalian_Phenotype_Level_3... Done.
#> Querying MGI_Mammalian_Phenotype_Level_4_2021... Done.
#> Querying WikiPathways_2019_Human... Done.
#> Querying Panther_2015... Done.
#> Parsing results... Done.PCE tests (up in PLAC1)
VentoTormo
plotPCE(inputGenes = PLAC1vsSCR.up.pce1, dataBG = teVt, cellInfo = md.vt, myColor = "#00b462")
#> Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.Xiang
plotPCE(inputGenes = PLAC1vsSCR.up.pce2, dataBG = teXi, cellInfo = md.xi, myColor = "#00b462")Zhou&Petropoulos (Castel)
plotPCE(inputGenes = PLAC1vsSCR.up.pce2, dataBG = teZp, cellInfo = md.zp, myColor = "#00b462")Rostovskaya (Hs)
plotPCE(inputGenes = PLAC1vsSCR.up.pce2, dataBG = teRo.hs, cellInfo = md.ro, myColor = "#00b462")Rostovskaya (Cy)
plotPCE(inputGenes = PLAC1vsSCR.up.pce2, dataBG = teRo.cy, cellInfo = md.ro, myColor = "#00b462")PCE tests (up in SCR)
VentoTormo
plotPCE(inputGenes = PLAC1vsSCR.dw.pce1, dataBG = teVt, cellInfo = md.vt, myColor = "#980c80")Xiang
plotPCE(inputGenes = PLAC1vsSCR.dw.pce2, dataBG = teXi, cellInfo = md.xi, myColor = "#980c80")Zhou&Petropoulos (Castel)
plotPCE(inputGenes = PLAC1vsSCR.dw.pce2, dataBG = teZp, cellInfo = md.zp, myColor = "#980c80")Rostovskaya (Hs)
plotPCE(inputGenes = PLAC1vsSCR.dw.pce2, dataBG = teRo.hs, cellInfo = md.ro, myColor = "#980c80")Rostovskaya (Cy)
plotPCE(inputGenes = PLAC1vsSCR.dw.pce2, dataBG = teRo.cy, cellInfo = md.ro, myColor = "#980c80")Enrichment tests (up in PLAC1)
DisGeNET
plotEnrichR(PLAC1vsSCR.up.enriched, table="DisGeNET" , myColor = "#00b462")GO_Biological_Process_2021
plotEnrichR(PLAC1vsSCR.up.enriched, table="GO_Biological_Process_2021" , "#00b462")HDSigDB_Human_2021
plotEnrichR(PLAC1vsSCR.up.enriched, table="HDSigDB_Human_2021" , "#00b462")KEGG_2021_Human
plotEnrichR(PLAC1vsSCR.up.enriched, table="KEGG_2021_Human" , "#00b462")GTEx_Tissues_V8_2023
plotEnrichR(PLAC1vsSCR.up.enriched, table="GTEx_Tissues_V8_2023" , "#00b462")MGI_Mammalian_Phenotype_Level_3
plotEnrichR(PLAC1vsSCR.up.enriched, table="MGI_Mammalian_Phenotype_Level_3" , "#00b462")MGI_Mammalian_Phenotype_Level_4_2021
plotEnrichR(PLAC1vsSCR.up.enriched, table="MGI_Mammalian_Phenotype_Level_4_2021" , "#00b462")WikiPathways_2019_Human
plotEnrichR(PLAC1vsSCR.up.enriched, table="WikiPathways_2019_Human" , "#00b462")Panther_2015
plotEnrichR(PLAC1vsSCR.up.enriched, table="Panther_2015" , "#00b462")Enrichment tests (up in SCR)
DisGeNET
plotEnrichR(PLAC1vsSCR.dw.enriched, table="DisGeNET" , myColor = "#980c80")GO_Biological_Process_2021
plotEnrichR(PLAC1vsSCR.dw.enriched, table="GO_Biological_Process_2021" , "#980c80")HDSigDB_Human_2021
plotEnrichR(PLAC1vsSCR.dw.enriched, table="HDSigDB_Human_2021" , "#980c80")KEGG_2021_Human
plotEnrichR(PLAC1vsSCR.dw.enriched, table="KEGG_2021_Human" , "#980c80")GTEx_Tissues_V8_2023
plotEnrichR(PLAC1vsSCR.dw.enriched, table="GTEx_Tissues_V8_2023" , "#980c80")MGI_Mammalian_Phenotype_Level_3
plotEnrichR(PLAC1vsSCR.dw.enriched, table="MGI_Mammalian_Phenotype_Level_3" , "#980c80")MGI_Mammalian_Phenotype_Level_4_2021
plotEnrichR(PLAC1vsSCR.dw.enriched, table="MGI_Mammalian_Phenotype_Level_4_2021" , "#980c80")WikiPathways_2019_Human
plotEnrichR(PLAC1vsSCR.dw.enriched, table="WikiPathways_2019_Human" , "#980c80")Panther_2015
plotEnrichR(PLAC1vsSCR.dw.enriched, table="Panther_2015" , "#980c80")Save RData
Saving the entire session as well as the DEseq2 object (as
rds) for doing the overlap analyses.
save.image(file = "assets/hTS_CT27_ST3D_PLAC1_KD.RData")
save(PLAC1vsSCR.up.pce1, PLAC1vsSCR.dw.pce1, file = "assets/hTS_CT27_ST3D_PLAC1_KD_genelists.RData")MultiQC report:
NOT YET MultiQC report is available at this link
Session Information
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] enrichR_3.1 DT_0.27
#> [3] htmlwidgets_1.5.4 kableExtra_1.3.4
#> [5] scales_1.2.1 cowplot_1.1.1
#> [7] plotly_4.10.1 TissueEnrich_1.18.0
#> [9] GSEABase_1.60.0 graph_1.76.0
#> [11] annotate_1.76.0 XML_3.99-0.12
#> [13] AnnotationDbi_1.60.0 ensurer_1.1
#> [15] biomaRt_2.54.1 reshape2_1.4.4
#> [17] RColorBrewer_1.1-3 ggrepel_0.9.3
#> [19] pheatmap_1.0.12 vsn_3.66.0
#> [21] DESeq2_1.38.3 SummarizedExperiment_1.28.0
#> [23] Biobase_2.58.0 MatrixGenerics_1.10.0
#> [25] matrixStats_0.63.0 GenomicRanges_1.50.1
#> [27] GenomeInfoDb_1.34.3 IRanges_2.32.0
#> [29] S4Vectors_0.36.0 BiocGenerics_0.44.0
#> [31] forcats_0.5.2 stringr_1.5.0
#> [33] dplyr_1.0.10 purrr_1.0.1
#> [35] readr_2.1.3 tidyr_1.3.0
#> [37] tibble_3.1.8 ggplot2_3.4.0
#> [39] tidyverse_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] readxl_1.4.1 backports_1.4.1 BiocFileCache_2.6.1
#> [4] systemfonts_1.0.4 plyr_1.8.8 lazyeval_0.2.2
#> [7] crosstalk_1.2.0 BiocParallel_1.32.1 digest_0.6.30
#> [10] htmltools_0.5.3 fansi_1.0.3 magrittr_2.0.3
#> [13] memoise_2.0.1 googlesheets4_1.0.1 tzdb_0.3.0
#> [16] limma_3.54.0 Biostrings_2.66.0 modelr_0.1.10
#> [19] vroom_1.6.0 svglite_2.1.0 timechange_0.1.1
#> [22] rmdformats_1.0.4 prettyunits_1.1.1 colorspace_2.0-3
#> [25] blob_1.2.3 rvest_1.0.3 rappdirs_0.3.3
#> [28] haven_2.5.1 xfun_0.35 crayon_1.5.2
#> [31] RCurl_1.98-1.9 jsonlite_1.8.3 glue_1.6.2
#> [34] gtable_0.3.1 gargle_1.2.1 zlibbioc_1.44.0
#> [37] XVector_0.38.0 webshot_0.5.4 DelayedArray_0.24.0
#> [40] DBI_1.1.3 Rcpp_1.0.9 viridisLite_0.4.1
#> [43] xtable_1.8-4 progress_1.2.2 bit_4.0.5
#> [46] preprocessCore_1.60.2 httr_1.4.4 ellipsis_0.3.2
#> [49] farver_2.1.1 pkgconfig_2.0.3 sass_0.4.3
#> [52] dbplyr_2.2.1 locfit_1.5-9.7 utf8_1.2.2
#> [55] tidyselect_1.2.0 labeling_0.4.2 rlang_1.1.1
#> [58] munsell_0.5.0 cellranger_1.1.0 tools_4.2.2
#> [61] cachem_1.0.6 cli_3.4.1 generics_0.1.3
#> [64] RSQLite_2.2.18 broom_1.0.1 evaluate_0.18
#> [67] fastmap_1.1.0 yaml_2.3.6 knitr_1.41
#> [70] bit64_4.0.5 fs_1.5.2 KEGGREST_1.38.0
#> [73] xml2_1.3.3 compiler_4.2.2 rstudioapi_0.14
#> [76] filelock_1.0.2 curl_4.3.3 png_0.1-7
#> [79] affyio_1.68.0 reprex_2.0.2 geneplotter_1.76.0
#> [82] bslib_0.4.1 stringi_1.7.8 highr_0.9
#> [85] lattice_0.20-45 Matrix_1.5-3 vctrs_0.6.2
#> [88] pillar_1.8.1 lifecycle_1.0.3 BiocManager_1.30.19
#> [91] jquerylib_0.1.4 data.table_1.14.6 bitops_1.0-7
#> [94] R6_2.5.1 affy_1.76.0 bookdown_0.33
#> [97] codetools_0.2-18 assertthat_0.2.1 rjson_0.2.21
#> [100] withr_2.5.0 GenomeInfoDbData_1.2.9 parallel_4.2.2
#> [103] hms_1.1.2 grid_4.2.2 rmarkdown_2.18
#> [106] googledrive_2.0.0 lubridate_1.9.0